import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
# IPython magic: render matplotlib figures inline.
# NOTE(review): notebook-export artefact — this line is a SyntaxError if the
# file is run as a plain .py script.
%matplotlib inline
plt.style.use('fivethirtyeight')
# Widen pandas display limits so wide frames print fully in the notebook.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
# Load the raw marketing dataset (Portuguese column names).
data = pd.read_csv('ml_project1_data_pt.csv', delimiter=',')
# Quick sanity checks (bare expressions are notebook cell outputs).
data.head(3)
data.shape
data.columns
# Checking for null values: absolute count and percentage per column.
# FIX: DataFrame.append was removed in pandas 2.0 — build both rows and
# combine them with pd.concat instead.
info = pd.DataFrame(data=data.isnull().sum()).T.rename(index={0: 'Null values'})
pct = pd.DataFrame(data=data.isnull().sum() / data.shape[0] * 100).T.rename(index={0: '% Null values'})
info = pd.concat([info, pct])
info
# Checking for Duplicates :
data.duplicated().sum()
data.describe()
# --- Feature engineering --------------------------------------------------
# Parse the first-purchase date; unparseable values become NaT.
data['dt_primcomp'] = pd.to_datetime(data['dt_primcomp'], errors='coerce')
# Keep only month/year (NOTE: this turns the column back into strings).
data['dt_primcomp'] = data['dt_primcomp'].dt.strftime('%m/%Y')
# Age relative to the dataset's reference year (2020).
data['age'] = 2020 - data['ano_nasc']
data['renda_mes_media'] = data['renda_ano'] / 12
# Fraction of the five campaigns the customer accepted (computed while the
# Cmp* columns are still numeric).
data['campaing_engagement'] = (data['Cmp1'] + data['Cmp2'] + data['Cmp3']
                               + data['Cmp4'] + data['Cmp5']) / 5
# Treat the flag columns as strings so the EDA plots see them as categorical.
for col in ['target', 'Cmp1', 'Cmp2', 'Cmp3', 'Cmp4', 'Cmp5', 'reclamacoes']:
    data[col] = data[col].astype(str)
# Digital profile flag: few web visits last month AND few web purchases.
# FIX: replaced chained-indexing assignment (SettingWithCopyWarning; may
# silently fail to write back) with a single .loc assignment.
data['digital_profile'] = '0'
data.loc[(data['num_visit_web_ult_mes'] < 5) & (data['promocoes_web'] < 3),
         'digital_profile'] = '1'
# How many columns share each dtype (notebook cell output).
data.dtypes.groupby(data.dtypes).size()
# Build a (column, dtype) table and drop the columns handled separately.
dtypes = pd.DataFrame(data.dtypes.rename('type')).reset_index().astype('str')
for excluded in ('dt_primcomp', 'ID', 'target'):
    dtypes = dtypes.query(f'index != "{excluded}"')
# Split the remaining column names into numeric vs categorical arrays.
is_numeric = dtypes['type'].isin(['int64', 'float64'])
numeric = dtypes[is_numeric]['index'].values
categorical = dtypes[~dtypes['index'].isin(numeric) & (dtypes['index'] != 'target')]['index'].values
print('Numeric:\n', numeric)
print('Categorical:\n', categorical)
# Bar chart of value counts for every categorical attribute.
# FIX: `pylab` was never imported (NameError at runtime) — use plt.rcParams.
# Also dropped a dead `figsize=(8,4)` local that was assigned but never used.
plt.rcParams['figure.figsize'] = (6.0, 4.0)
for attr in categorical:
    plt.figure()
    data[attr].value_counts().plot(kind='bar', color='steelblue')
    plt.title(attr)
# For each categorical attribute: stacked target proportions per category
# (left panel), raw category counts (right panel), and a printed summary.
for attr in categorical:
    fig, (ax_prop, ax_count) = plt.subplots(1, 2, figsize=(12, 4))
    counts = (data.groupby([attr, 'target']).size()
                  .rename('count').reset_index())
    pivoted = counts.pivot(columns='target', values='count', index=attr)
    # Plot the proportions
    pivoted.div(pivoted.sum(1), axis=0).plot.bar(stacked=True, ax=ax_prop)
    # Plot the counts
    data[attr].value_counts().plot.bar(ax=ax_count, legend=False, color='steelblue')
    print('Support (%s)\n' % attr)
    print(data[attr].value_counts(), '\n')
    plt.title(attr)
# Re-parse the month/year strings back into datetimes.
# FIX: the strings were produced above with strftime('%m/%Y'), so give that
# format explicitly instead of the deprecated infer_datetime_format, which
# could misread day/month order.
data['dt_primcomp'] = pd.to_datetime(data['dt_primcomp'], format='%m/%Y')
# Unique customers per first-purchase month.
data.groupby('dt_primcomp')['ID'].nunique().plot(kind='bar')
plt.show()
# Number of responders (target == 1) per first-purchase month.
data['target'] = data['target'].astype(int)
data.groupby('dt_primcomp')['target'].sum().plot(kind='bar')
plt.show()
# Campaign acceptances per first-purchase month.
# FIXES: `pylab` was never imported (use plt.rcParams); the Cmp* columns were
# cast to str earlier, so .sum() would concatenate strings instead of adding
# — cast back to int before aggregating; column selection after groupby
# takes a list, not a tuple of names; removed a stray plt.figure() call that
# only created an extra empty figure after the plot.
plt.rcParams['figure.figsize'] = (28, 3)
cmp_cols = ['Cmp1', 'Cmp2', 'Cmp3', 'Cmp4', 'Cmp5']
data[cmp_cols].astype(int).groupby(data['dt_primcomp']).sum().plot(kind='bar')
plt.title("Campaing Success")
plt.show()
# Distribution of every numeric attribute.
data[numeric].hist(figsize=(18, 15))
data[numeric].describe()
# Spearman rank correlation between the numeric attributes.
plt.figure(figsize=(16, 12))
sns.heatmap(data[numeric].corr('spearman'), annot=True)
# Pairwise scatter of a few customer attributes on a random sample of 100
# rows, coloured by the (string) target.
cust_attrs = ['age', 'renda_mes_media', 'num_visit_web_ult_mes', 'target']
data['target'] = data['target'].astype(str)
numeric_outcome = data[list(numeric) + ['target']]
sns.pairplot(numeric_outcome[cust_attrs].sample(n=100), hue='target', aspect=1.2)
# --- 1. K-means ------------------------------------------------------------
# FIX: the bare line "1 . Kmeans" was a leftover markdown heading; it parses
# as an attribute lookup on the int literal 1 and raises AttributeError, so
# it is now part of this comment.
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

# Behavioural / monetary features used for segmentation (NaNs treated as 0).
X = data[['renda_ano', 'crianca_casa', 'adoles_casa', 'recencia_dias', 'vinho_montante', 'frutas_montante', 'carne_montante', 'peixe_montante', 'doces_montante', 'ouro_montante', 'promocoes_desconto', 'promocoes_web', 'promocoes_catalogo', 'promocoes_store', 'num_visit_web_ult_mes', 'age', 'renda_mes_media', 'campaing_engagement']]
X = X.fillna(0)
# Standardise so every feature contributes on a comparable scale.
scaler = StandardScaler()
scaled = scaler.fit_transform(X)
print(scaled)
# Scan candidate cluster counts and report the average silhouette for each.
for n_clusters in range(3, 10):
    kmeans = KMeans(init='k-means++', n_clusters=n_clusters, n_init=30)
    kmeans.fit(scaled)
    clusters = kmeans.predict(scaled)
    sil_avg = silhouette_score(scaled, clusters)
    print("For n_clusters : ", n_clusters, "The average silhouette_score is : ", sil_avg)
# Fix the cluster count at 3 and keep re-running k-means (fresh random
# initialisations each pass) until the average silhouette clears 0.145.
# NOTE: this relies on randomness, so the number of passes varies.
n_clusters = 3
sil_avg = -1
while sil_avg < 0.145:
    kmeans = KMeans(init='k-means++', n_clusters=n_clusters, n_init=30)
    clusters = kmeans.fit_predict(scaled)
    sil_avg = silhouette_score(scaled, clusters)
print("For n_clusters : ", n_clusters, "The average silhouette_score is : ", sil_avg)
# Number of customers assigned to each cluster.
pd.Series(clusters).value_counts()
def graph_component_silhouette(n_clusters, lim_x, mat_size, sample_silhouette_values, clusters):
    """Draw a silhouette diagram: one horizontal band per cluster.

    Each band shows the sorted per-sample silhouette values of that
    cluster, with the cluster index labelled in red on the left.

    n_clusters -- number of clusters to draw bands for
    lim_x -- (min, max) limits for the silhouette-value axis
    mat_size -- total number of samples (sets the vertical extent)
    sample_silhouette_values -- per-sample silhouette values (array)
    clusters -- per-sample cluster labels (array, same length)
    """
    import matplotlib as mpl
    mpl.rc('patch', edgecolor='dimgray', linewidth=1)
    fig, ax = plt.subplots(1, 1)
    fig.set_size_inches(8, 8)
    ax.set_xlim([lim_x[0], lim_x[1]])
    ax.set_ylim([0, mat_size + (n_clusters + 1) * 10])
    band_bottom = 10  # leave a margin below the first band
    for cluster_id in range(n_clusters):
        band_values = sample_silhouette_values[clusters == cluster_id]
        band_values.sort()
        band_size = band_values.shape[0]
        band_top = band_bottom + band_size
        ax.fill_betweenx(np.arange(band_bottom, band_top), 0, band_values, alpha=0.8)
        ax.text(-0.03, band_bottom + 0.5 * band_size, str(cluster_id),
                color='red', fontweight='bold',
                bbox=dict(facecolor='white', edgecolor='black',
                          boxstyle='round, pad = 0.3'))
        band_bottom = band_top + 10  # 10-unit gap between bands
# Compute per-sample silhouette values for the final clustering and render
# the intra-cluster silhouette diagram with the helper above.
from sklearn.metrics import silhouette_samples
sample_silhouette_values = silhouette_samples(scaled, clusters)
graph_component_silhouette(n_clusters, [-0.07, 0.33], len(X),
                           sample_silhouette_values, clusters)
# --- PCA -------------------------------------------------------------------
# FIX: the bare line "PCA" was a leftover markdown heading that would raise
# NameError (evaluated before the import below); it is now part of this
# comment.
from sklearn.decomposition import PCA

pca = PCA()
pca.fit(scaled)
pca_samples = pca.transform(scaled)
# Variance explained per principal component: cumulative step line plus
# individual bars.
fig, ax = plt.subplots(figsize=(14, 5))
sns.set(font_scale=1)
plt.step(range(scaled.shape[1]), pca.explained_variance_ratio_.cumsum(),
         where='mid', label='Cummulative Variance Explained')
# FIX: seaborn >= 0.12 removed positional data vectors for barplot — pass
# them as x=/y= keywords. Also dropped the no-op set_xticklabels call that
# reassigned the existing tick labels unchanged.
sns.barplot(x=np.arange(1, scaled.shape[1] + 1),
            y=pca.explained_variance_ratio_,
            alpha=0.5, color='g', label='Individual Variance Explained')
plt.xlim(0, 20)
plt.xticks(rotation=45, fontsize=16)
plt.ylabel("Explained Variance", fontsize=18)
plt.xlabel("Principal Components", fontsize=18)
plt.legend(loc='upper right', fontsize=12)
plt.show()
# Attach the final k-means labels (from the retry loop above) to the table.
data['fit_segmentacao'] = kmeans.labels_
data
# Store the cluster label as a string so downstream tools treat it as
# categorical.
data['fit_segmentacao'] = data['fit_segmentacao'].astype(str)
# Full automated EDA report over the enriched dataset.
# NOTE(review): pandas_profiling has been renamed to ydata-profiling; this
# import only works with the legacy package installed — confirm environment.
from pandas_profiling import ProfileReport
profile = ProfileReport(data, title="Data Profile Report")
profile